Goal: can machine learning methods help us to associate metabolites with leaf length?
Previously (script 11b2) I filtered out unnamed metabolites. Here I keep them all. but remove blank samples
library(glmnet)
Loading required package: Matrix
Loaded glmnet 4.1
library(relaimpo)
Loading required package: MASS
Loading required package: boot
Loading required package: survey
Loading required package: grid
Loading required package: survival
Attaching package: ‘survival’
The following object is masked from ‘package:boot’:
aml
Attaching package: ‘survey’
The following object is masked from ‘package:graphics’:
dotchart
Loading required package: mitools
This is the global version of package relaimpo.
If you are a non-US user, a version with the interesting additional metric pmvd is available
from Ulrike Groempings web site at prof.beuth-hochschule.de/groemping.
library(tidyverse)
Registered S3 methods overwritten by 'dbplyr':
method from
print.tbl_lazy
print.tbl_sql
── Attaching packages ──────────────────────────────────────────────────────── tidyverse 1.3.0 ──
✓ ggplot2 3.3.3 ✓ purrr 0.3.4
✓ tibble 3.0.6 ✓ dplyr 1.0.4
✓ tidyr 1.1.2 ✓ stringr 1.4.0
✓ readr 1.4.0 ✓ forcats 0.5.1
── Conflicts ─────────────────────────────────────────────────────────── tidyverse_conflicts() ──
x tidyr::expand() masks Matrix::expand()
x dplyr::filter() masks stats::filter()
x dplyr::lag() masks stats::lag()
x tidyr::pack() masks Matrix::pack()
x dplyr::select() masks MASS::select()
x tidyr::unpack() masks Matrix::unpack()
library(broom)
get leaflength data
leaflength <- read_csv("../../plant/output/leaf_lengths_metabolite.csv") %>%
mutate(pot=str_pad(pot, width=3, pad="0"),
sampleID=str_c("wyo", genotype, pot, sep="_"),
group=str_c(soil,genotype,trt,sep="_")) %>%
select(sampleID, group, genotype, trt, leaf_avg_std) %>%
filter(!str_detect(group, "BLANK"))
── Column specification ─────────────────────────────────────────────────────────────────────────
cols(
pot = col_double(),
soil = col_character(),
genotype = col_character(),
trt = col_character(),
leaf_avg = col_double(),
leaf_avg_std = col_double()
)
length(unique(leaflength$group))
[1] 4
leaflength %>% arrange(sampleID)
get and wrangle metabolite data
met_raw <-read_csv("../input/metabolites_set1.csv")
── Column specification ─────────────────────────────────────────────────────────────────────────
cols(
.default = col_double(),
tissue = col_character(),
soil = col_character(),
genotype = col_character(),
autoclave = col_character(),
time_point = col_character(),
concatenate = col_character()
)
ℹ Use `spec()` for the full column specifications.
met <- met_raw %>%
mutate(pot=str_pad(pot, width = 3, pad = "0")) %>%
mutate(sampleID=str_c("wyo", genotype, pot, sep="_")) %>%
filter(soil!="BLANK") %>%
select(sampleID, genotype, tissue, sample_mass = `sample_mass mg`, !submission_number:concatenate) %>%
pivot_longer(!sampleID:sample_mass, names_to = "metabolite", values_to = "met_amount") %>%
#adjust by sample mass
mutate(met_per_mg=met_amount/sample_mass) %>%
#scale and center
group_by(metabolite, genotype, tissue) %>%
mutate(met_per_mg=scale(met_per_mg),
met_amt=scale(met_amount)
) %>%
pivot_wider(id_cols = sampleID,
names_from = c(tissue, metabolite),
values_from = starts_with("met_"),
names_sep = "_")
met
split this into two data frames, one normalized by tissue amount and one not.
met_per_mg <- met %>% select(sampleID, starts_with("met_per_mg")) %>%
as.data.frame() %>% column_to_rownames("sampleID")
met_amt <- met %>% select(sampleID, starts_with("met_amt")) %>%
as.data.frame() %>% column_to_rownames("sampleID")
get leaf data order to match
leaflength <- leaflength[match(met$sampleID, leaflength$sampleID),]
leaflength
met_per_mg.leaf_PCA <- met_per_mg %>%
select(matches("_leaf_")) %>%
prcomp(center = FALSE, scale. = FALSE) #already centered and scaled
names(met_per_mg.leaf_PCA)
[1] "sdev" "rotation" "center" "scale" "x"
tibble(variance=met_per_mg.leaf_PCA$sdev^2, PC=str_c("PC",
str_pad(1:length(met_per_mg.leaf_PCA$sdev), width = 2, pad="0"))) %>%
mutate(percent_var=100*variance/sum(variance),
cumulative_var=cumsum(percent_var)) %>%
magrittr::extract(1:15,) %>%
ggplot(aes(x=PC, y=percent_var)) +
geom_col(fill="skyblue") +
geom_line(aes(y=cumulative_var), group="") +
ggtitle("percent variance explained, named, normalized leaf metabolites")
met_per_mg.leaf_PCA$x %>%
as.data.frame() %>%
rownames_to_column("sampleID") %>%
left_join(leaflength) %>%
select(sampleID, genotype, trt, starts_with("PC")) %>%
summarise(across(starts_with("PC"), ~ t.test(.x ~ trt)$p.value)) %>%
pivot_longer(starts_with("PC")) %>%
arrange(value)
Joining, by = "sampleID"
met_per_mg.root_PCA <- met_per_mg %>%
select(matches("_root_")) %>%
prcomp(center = FALSE, scale. = FALSE) #already centered and scaled
names(met_per_mg.root_PCA)
[1] "sdev" "rotation" "center" "scale" "x"
tibble(variance=met_per_mg.root_PCA$sdev^2, PC=str_c("PC",
str_pad(1:length(met_per_mg.root_PCA$sdev), width = 2, pad="0"))) %>%
mutate(percent_var=100*variance/sum(variance),
cumulative_var=cumsum(percent_var)) %>%
magrittr::extract(1:15,) %>%
ggplot(aes(x=PC, y=percent_var)) +
geom_col(fill="skyblue") +
geom_line(aes(y=cumulative_var), group="") +
ggtitle("percent variance explained, named, normalized root metabolites")
met_per_mg.root_PCA$x %>%
as.data.frame() %>%
rownames_to_column("sampleID") %>%
left_join(leaflength) %>%
select(sampleID, genotype, trt, starts_with("PC")) %>%
summarise(across(starts_with("PC"), ~ t.test(.x ~ trt)$p.value)) %>%
pivot_longer(starts_with("PC")) %>%
arrange(value)
Joining, by = "sampleID"
PC2 and marginally PC5
met_per_mg.root_PCA$x %>%
as.data.frame() %>%
rownames_to_column("sampleID") %>%
left_join(leaflength) %>%
select(sampleID, genotype, trt, starts_with("PC")) %>%
summarise(across(starts_with("PC"), ~ t.test(.x ~ genotype)$p.value)) %>%
pivot_longer(starts_with("PC")) %>%
arrange(value)
Joining, by = "sampleID"
met_amt.leaf_PCA <- met_amt %>%
select(matches("_leaf_")) %>%
prcomp(center = FALSE, scale. = FALSE) #already centered and scaled
names(met_per_mg.leaf_PCA)
[1] "sdev" "rotation" "center" "scale" "x"
tibble(variance=met_amt.leaf_PCA$sdev^2, PC=str_c("PC",
str_pad(1:length(met_amt.leaf_PCA$sdev), width = 2, pad="0"))) %>%
mutate(percent_var=100*variance/sum(variance),
cumulative_var=cumsum(percent_var)) %>%
magrittr::extract(1:15,) %>%
ggplot(aes(x=PC, y=percent_var)) +
geom_col(fill="skyblue") +
geom_line(aes(y=cumulative_var), group="") +
ggtitle("percent variance explained, named, raw leaf metabolites")
t.tests
met_amt.leaf_PCA$x %>%
as.data.frame() %>%
rownames_to_column("sampleID") %>%
left_join(leaflength) %>%
select(sampleID, genotype, trt, starts_with("PC")) %>%
summarise(across(starts_with("PC"), ~ t.test(.x ~ trt)$p.value)) %>%
pivot_longer(starts_with("PC")) %>%
arrange(value)
Joining, by = "sampleID"
met_amt.leaf_PCA$x %>%
as.data.frame() %>%
rownames_to_column("sampleID") %>%
left_join(leaflength) %>%
select(sampleID, genotype, trt, starts_with("PC")) %>%
summarise(across(starts_with("PC"), ~ t.test(.x ~ genotype)$p.value)) %>%
pivot_longer(starts_with("PC")) %>%
arrange(value)
Joining, by = "sampleID"
met_amt.root_PCA <- met_amt %>%
select(matches("_root_")) %>%
prcomp(center = FALSE, scale. = FALSE) #already centered and scaled
names(met_per_mg.root_PCA)
[1] "sdev" "rotation" "center" "scale" "x"
tibble(variance=met_amt.root_PCA$sdev^2, PC=str_c("PC",
str_pad(1:length(met_amt.root_PCA$sdev), width = 2, pad="0"))) %>%
mutate(percent_var=100*variance/sum(variance),
cumulative_var=cumsum(percent_var)) %>%
magrittr::extract(1:15,) %>%
ggplot(aes(x=PC, y=percent_var)) +
geom_col(fill="skyblue") +
geom_line(aes(y=cumulative_var), group="") +
ggtitle("percent variance explained, named, raw root metabolites")
t.test
met_amt.root_PCA$x %>%
as.data.frame() %>%
rownames_to_column("sampleID") %>%
left_join(leaflength) %>%
select(sampleID, genotype, trt, starts_with("PC")) %>%
summarise(across(starts_with("PC"), ~ t.test(.x ~ trt)$p.value)) %>%
pivot_longer(starts_with("PC")) %>%
arrange(value)
Joining, by = "sampleID"
met_amt.root_PCA$x %>%
as.data.frame() %>%
rownames_to_column("sampleID") %>%
left_join(leaflength) %>%
select(sampleID, genotype, trt, starts_with("PC")) %>%
summarise(across(starts_with("PC"), ~ t.test(.x ~ genotype)$p.value)) %>%
pivot_longer(starts_with("PC")) %>%
arrange(value)
Joining, by = "sampleID"
are the PCs normalized?
colMeans(met_amt.leaf_PCA$x) %>% round(3) #yes centered
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12 PC13 PC14 PC15 PC16 PC17 PC18 PC19
0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
PC20 PC21 PC22 PC23 PC24
0 0 0 0 0
apply(met_amt.leaf_PCA$x, 2, sd) %>% round(2) #not scaled
PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9 PC10 PC11 PC12 PC13 PC14 PC15 PC16
11.89 9.03 6.73 6.47 5.80 5.78 5.37 5.08 4.58 4.40 4.36 4.35 4.03 3.83 3.76 3.64
PC17 PC18 PC19 PC20 PC21 PC22 PC23 PC24
3.55 3.46 3.30 3.19 3.14 3.06 0.00 0.00
combine the leaf and root, and then scale them:
met_per_mg.leaf_PCs <- met_per_mg.leaf_PCA$x
colnames(met_per_mg.leaf_PCs) <- str_c("leaf_", colnames(met_per_mg.leaf_PCs))
met_per_mg.root_PCs <- met_per_mg.root_PCA$x
colnames(met_per_mg.root_PCs) <- str_c("root_", colnames(met_per_mg.root_PCs))
met_per_mg.PCs <- cbind(met_per_mg.leaf_PCs, met_per_mg.root_PCs) %>%
scale()
met_amt.leaf_PCs <- met_amt.leaf_PCA$x
colnames(met_amt.leaf_PCs) <- str_c("leaf_", colnames(met_amt.leaf_PCs))
met_amt.root_PCs <- met_amt.root_PCA$x
colnames(met_amt.root_PCs) <- str_c("root_", colnames(met_amt.root_PCs))
met_amt.PCs <- cbind(met_amt.leaf_PCs, met_amt.root_PCs) %>%
scale()
also combine the rotations
met_per_mg.leaf_rotation <- met_per_mg.leaf_PCA$rotation %>%
as.data.frame() %>%
rename_with(~ str_c("leaf_", .x)) %>%
rownames_to_column("metabolite")
met_per_mg.root_rotation <- met_per_mg.root_PCA$rotation %>%
as.data.frame() %>%
rename_with(~ str_c("root_", .x)) %>%
rownames_to_column("metabolite")
met_per_mg.PC_rotation <- full_join(met_per_mg.leaf_rotation, met_per_mg.root_rotation, by="metabolite")
met_amt.leaf_rotation <- met_amt.leaf_PCA$rotation %>%
as.data.frame() %>%
rename_with(~ str_c("leaf_", .x)) %>%
rownames_to_column("metabolite")
met_amt.root_rotation <- met_amt.root_PCA$rotation %>%
as.data.frame() %>%
rename_with(~ str_c("root_", .x)) %>%
rownames_to_column("metabolite")
met_amt.PC_rotation <- full_join(met_amt.leaf_rotation, met_amt.root_rotation, by="metabolite")
met_per_mg_fit1LOO <- cv.glmnet(x=met_per_mg.PCs, y=leaflength$leaf_avg_std, nfolds = nrow(met_per_mg.PCs), alpha=1 )
Option grouped=FALSE enforced in cv.glmnet, since < 3 observations per fold
plot(met_per_mg_fit1LOO)
bestlam=met_per_mg_fit1LOO$lambda.1se
NEXT STEP: Do a K-fold CV, repeat many times and average. Might as well do alpha while we are at it. If we are doing alpha, then we need to manually create our own folds list for each run
Fit 101 CVs for each of 11 alphas
set.seed(1245)
folds <- tibble(run=1:101) %>%
mutate(folds=map(run, ~ sample(rep(1:4,6))))
system.time (met_per_mg_multiCV <- expand_grid(run=1:100, alpha=round(seq(0,1,.1),1)) %>%
left_join(folds, by="run") %>%
mutate(fit=map2(folds, alpha, ~ cv.glmnet(x=met_per_mg.PCs, y=leaflength$leaf_avg_std, foldid = .x, alpha=.y
)))
#, lambda=exp(seq(-5,0,length.out = 50)) )))
) #100 seconds
user system elapsed
57.212 2.016 71.561
head(met_per_mg_multiCV)
for each fit, pull out the mean cv error, lambda, min lambda, and 1se lambda
met_per_mg_multiCV <- met_per_mg_multiCV %>%
mutate(cvm=map(fit, magrittr::extract("cvm")),
lambda=map(fit, magrittr::extract("lambda")),
lambda.min=map_dbl(fit, magrittr::extract("lambda.min" )),
lambda.1se=map_dbl(fit, magrittr::extract("lambda.1se")),
nzero=map(fit, magrittr::extract("nzero"))
)
head(met_per_mg_multiCV)
now calculate the mean and sem of cvm and min,1se labmdas. These need to be done separately because of the way the grouping works
met_per_mg_summary_cvm <- met_per_mg_multiCV %>% dplyr::select(-fit, -folds) %>%
unnest(c(cvm, lambda)) %>%
group_by(alpha, lambda) %>%
summarize(meancvm=mean(cvm), sem=sd(cvm)/sqrt(n()), high=meancvm+sem, low=meancvm-sem)
`summarise()` has grouped output by 'alpha'. You can override using the `.groups` argument.
met_per_mg_summary_cvm
met_per_mg_summary_lambda <- met_per_mg_multiCV %>% dplyr::select(-fit, -folds, -cvm) %>%
group_by(alpha) %>%
summarize(
lambda.min.sd=sd(lambda.min),
lambda.min.mean=mean(lambda.min),
#lambda.min.med=median(lambda.min),
lambda.min.high=lambda.min.mean+lambda.min.sd,
#lambda.min.low=lambda.min.mean-lambda.min.sem,
#lambda.1se.sem=sd(lambda.1se)/sqrt(n()),
lambda.1se.mean=mean(lambda.1se),
#lambda.1se.med=median(lambda.1se),
#lambda.1se.high=lambda.1se+lambda.1se.sem,
#lambda.1se.low=lambda.1se-lambda.1se.sem,
nzero=nzero[1],
lambda=lambda[1]
)
met_per_mg_summary_lambda
plot it
met_per_mg_summary_cvm %>%
#filter(alpha!=0) %>% # worse than everything else and throwing the plots off
ggplot(aes(x=log(lambda), y= meancvm, ymin=low, ymax=high)) +
geom_ribbon(alpha=.25) +
geom_line(aes(color=as.character(alpha))) +
facet_wrap(~ as.character(alpha)) +
coord_cartesian(xlim=(c(-5,0))) +
geom_vline(aes(xintercept=log(lambda.min.mean)), alpha=.5, data=met_per_mg_summary_lambda) +
geom_vline(aes(xintercept=log(lambda.min.high)), alpha=.5, data=met_per_mg_summary_lambda, color="blue")
So overall these look more reasonable than the LOO plot.
Make a plot of MSE at minimum lambda for each alpha
met_per_mg_summary_cvm %>%
group_by(alpha) %>%
filter(rank(meancvm, ties.method = "first")==1) %>%
ggplot(aes(x=alpha,y=meancvm,ymin=low,ymax=high)) +
geom_ribbon(color=NA, fill="gray80") +
geom_line() +
geom_point()
not a particular large difference there, aside from 0.1 and even then, not too much better.
Plot the number of nzero coefficients
met_per_mg_summary_lambda %>%
unnest(c(lambda, nzero)) %>%
group_by(alpha) %>%
filter(abs(lambda.min.mean-lambda)==min(abs(lambda.min.mean-lambda)) ) %>%
ungroup() %>%
ggplot(aes(x=as.character(alpha), y=nzero)) +
geom_point() +
ggtitle("Number of non-zero coefficents at minimum lambda") +
ylim(0,24)
OK let’s do repeated test train starting from these CV lambdas
multi_tt <- function(lambda, alpha, n=10000, sample_size=24, train_size=20, x, y=leaflength$leaf_avg_std) {
print(lambda)
print(alpha)
tt <-
tibble(run=1:n) %>%
mutate(train=map(run, ~ sample(1:sample_size, train_size))) %>%
mutate(fit=map(train, ~ glmnet(x=x[.,], y=y[.], lambda = lambda, alpha = alpha ))) %>%
mutate(pred=map2(fit, train, ~ predict(.x, newx = x[-.y,]))) %>%
mutate(cor=map2_dbl(pred, train, ~ cor(.x, y[-.y]) )) %>%
mutate(MSE=map2_dbl(pred, train, ~ mean((y[-.y] - .x)^2))) %>%
summarize(
num_na=sum(is.na(cor)),
num_lt_0=sum(cor<=0, na.rm=TRUE),
avg_cor=mean(cor, na.rm=TRUE),
avg_MSE=mean(MSE))
tt
}
per_mg_fit_test_train <- met_per_mg_summary_lambda %>%
select(alpha, lambda.min.mean)
per_mg_fit_test_train <- met_per_mg_multiCV %>%
filter(run==1) %>%
select(alpha, fit) %>%
right_join(per_mg_fit_test_train)
Joining, by = "alpha"
per_mg_fit_test_train <- per_mg_fit_test_train %>%
mutate(pred_full=map2(fit, lambda.min.mean, ~ predict(.x, s=.y, newx=met_per_mg.PCs)),
full_R=map_dbl(pred_full, ~ cor(.x, leaflength$leaf_avg_std)),
full_MSE=map_dbl(pred_full, ~ mean((leaflength$leaf_avg_std-.x)^2))) %>%
mutate(tt=map2(lambda.min.mean, alpha, ~ multi_tt(lambda=.x, alpha=.y, x=met_per_mg.PCs)))
[1] 63.76959
[1] 0
[1] 3.23426
[1] 0.1
[1] 1.96389
[1] 0.2
[1] 1.390028
[1] 0.3
[1] 1.057486
[1] 0.4
[1] 0.8344099
[1] 0.5
[1] 0.7009496
[1] 0.6
[1] 0.5978767
[1] 0.7
[1] 0.527807
[1] 0.8
[1] 0.4681233
[1] 0.9
[1] 0.4171747
[1] 1
(per_mg_fit_test_train <- per_mg_fit_test_train %>% unnest(tt))
per_mg_fit_test_train %>%
ggplot(aes(x=alpha)) +
geom_line(aes(y=avg_cor), color="red") +
geom_point(aes(y=avg_cor), color="red") +
geom_line(aes(y=avg_MSE), color="blue") +
geom_point(aes(y=avg_MSE), color="blue")
all are similar from 0.2 onwards
alpha_per_mg <- .2
best_per_mg <- per_mg_fit_test_train %>% filter(alpha == alpha_per_mg)
best_per_mg_fit <- best_per_mg$fit[[1]]
best_per_mg_lambda <- best_per_mg$lambda.min.mean
per_mg_coef.tb <- coef(best_per_mg_fit, s=best_per_mg_lambda) %>%
as.matrix() %>% as.data.frame() %>%
rownames_to_column(var="PC") %>%
rename(beta=`1`)
per_mg_coef.tb %>% filter(beta!=0) %>% arrange(beta)
NA
pred and obs
plot(leaflength$leaf_avg_std, best_per_mg$pred_full[[1]])
cor.test(leaflength$leaf_avg_std, best_per_mg$pred_full[[1]]) #.65
Pearson's product-moment correlation
data: leaflength$leaf_avg_std and best_per_mg$pred_full[[1]]
t = 4.0007, df = 22, p-value = 0.0006021
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.3326307 0.8340147
sample estimates:
cor
0.64895
best_per_mg$full_MSE
[1] 0.8477346
per_mg_vars <- per_mg_coef.tb %>%
filter(beta !=0, PC!="(Intercept)") %>%
pull(PC) %>% c("leaf_avg_std", .)
per_mg_relimp <- leaflength %>% select(leaf_avg_std) %>% cbind(met_per_mg.PCs) %>% as.data.frame() %>% dplyr::select(all_of(per_mg_vars)) %>%
calc.relimp()
per_mg_coef.tb <- per_mg_relimp@lmg %>% as.matrix() %>% as.data.frame() %>%
rownames_to_column("PC") %>%
rename(PropVar_met_per_mg=V1) %>%
full_join(per_mg_coef.tb) %>%
arrange(desc(PropVar_met_per_mg))
Joining, by = "PC"
per_mg_coef.tb
NA
lm with gt and trt
lmtest <- met_per_mg.leaf_PCA$x %>%
as.data.frame() %>%
rownames_to_column("sampleID") %>%
left_join(leaflength) %>%
select(sampleID, genotype, trt, starts_with("PC")) %>%
pivot_longer(cols=starts_with("PC"), names_to = "PC") %>%
mutate(PC=str_c("leaf_", PC)) %>%
group_by(PC) %>%
nest() %>%
mutate(lm_add=map(data, ~ lm(value ~ genotype + trt, data=.)),
lm_int=map(data, ~ lm(value ~ genotype*trt, data=.)))
Joining, by = "sampleID"
lmtest %>% mutate(broomtidy = map(lm_add, tidy)) %>%
unnest(broomtidy) %>%
select(PC, term, p.value) %>%
filter(! str_detect(term, "Intercept"),
p.value < 0.1) %>%
arrange(term, p.value)
lmtest %>% mutate(broomtidy = map(lm_int, broom::tidy)) %>%
unnest(broomtidy) %>%
select(PC, term, p.value) %>%
filter(! str_detect(term, "Intercept"),
p.value < 0.1) %>%
arrange(term, p.value)
lmtest <- met_per_mg.root_PCA$x %>%
as.data.frame() %>%
rownames_to_column("sampleID") %>%
left_join(leaflength) %>%
select(sampleID, genotype, trt, starts_with("PC")) %>%
pivot_longer(cols=starts_with("PC"), names_to = "PC") %>%
mutate(PC=str_c("root_", PC)) %>%
group_by(PC) %>%
nest() %>%
mutate(lm_add=map(data, ~ lm(value ~ genotype + trt, data=.)),
lm_int=map(data, ~ lm(value ~ genotype*trt, data=.)))
Joining, by = "sampleID"
lmtest %>% mutate(broomtidy = map(lm_add, tidy)) %>%
unnest(broomtidy) %>%
select(PC, term, p.value) %>%
filter(! str_detect(term, "Intercept"),
p.value < 0.1) %>%
arrange(term, p.value)
lmtest %>% mutate(broomtidy = map(lm_int, broom::tidy)) %>%
unnest(broomtidy) %>%
select(PC, term, p.value) %>%
filter(! str_detect(term, "Intercept"),
p.value < 0.1) %>%
arrange(term, p.value)
Checkout the rotations.
met_per_mg_rotation_out <- met_per_mg.PC_rotation %>%
pivot_longer(-metabolite, names_to="PC", values_to="loading") %>%
filter(PC %in% filter(per_mg_coef.tb, beta!=0)$PC ) %>%
group_by(PC) %>%
filter(!str_detect(metabolite,".*(leaf|root)_[0-9]*$")) %>%
filter(abs(loading) >= 0.05) %>%
left_join(per_mg_coef.tb, by="PC") %>%
arrange(desc(abs(beta)), desc(abs(loading))) %>%
mutate(organ=ifelse(str_detect(metabolite, "_leaf_"), "leaf", "root"),
transformation="normalized",
metabolite=str_remove(metabolite, "met_per_mg_(root|leaf)_"),
metabolite_effect_on_leaf=ifelse(beta*loading>0, "increase", "decrease"))
met_per_mg_rotation_out %>% write_csv("../output/Leaf_associated_metabolites_normalized_NOBLANK.csv")
met_per_mg_rotation_out
Fit 101 CVs for each of 11 alphas
set.seed(1245)
folds <- tibble(run=1:101) %>%
mutate(folds=map(run, ~ sample(rep(1:6,4))))
system.time (met_amt_multiCV <- expand_grid(run=1:100, alpha=round(seq(0,1,.1),1)) %>%
left_join(folds, by="run") %>%
mutate(fit=map2(folds, alpha, ~ cv.glmnet(x=met_amt.PCs, y=leaflength$leaf_avg_std, foldid = .x, alpha=.y
)))
#, lambda=exp(seq(-5,0,length.out = 50)) )))
) #100 seconds
user system elapsed
68.443 2.182 78.396
head(met_amt_multiCV)
for each fit, pull out the mean cv error, lambda, min lambda, and 1se lambda
met_amt_multiCV <- met_amt_multiCV %>%
mutate(cvm=map(fit, magrittr::extract("cvm")),
lambda=map(fit, magrittr::extract("lambda")),
lambda.min=map_dbl(fit, magrittr::extract("lambda.min" )),
lambda.1se=map_dbl(fit, magrittr::extract("lambda.1se")),
nzero=map(fit, magrittr::extract("nzero"))
)
head(met_amt_multiCV)
now calculate the mean and sem of cvm and min,1se labmdas. These need to be done separately because of the way the grouping works
met_amt_summary_cvm <- met_amt_multiCV %>% dplyr::select(-fit, -folds) %>%
unnest(c(cvm, lambda)) %>%
group_by(alpha, lambda) %>%
summarize(meancvm=mean(cvm), sem=sd(cvm)/sqrt(n()), high=meancvm+sem, low=meancvm-sem)
`summarise()` has grouped output by 'alpha'. You can override using the `.groups` argument.
met_amt_summary_cvm
met_amt_summary_lambda <- met_amt_multiCV %>% dplyr::select(-fit, -folds, -cvm) %>%
group_by(alpha) %>%
summarize(
lambda.min.sd=sd(lambda.min),
lambda.min.mean=mean(lambda.min),
#lambda.min.med=median(lambda.min),
lambda.min.high=lambda.min.mean+lambda.min.sd,
#lambda.min.low=lambda.min.mean-lambda.min.sem,
#lambda.1se.sem=sd(lambda.1se)/sqrt(n()),
lambda.1se.mean=mean(lambda.1se),
#lambda.1se.med=median(lambda.1se),
#lambda.1se.high=lambda.1se+lambda.1se.sem,
#lambda.1se.low=lambda.1se-lambda.1se.sem,
nzero=nzero[1],
lambda=lambda[1]
)
met_amt_summary_lambda
plot it
met_amt_summary_cvm %>%
#filter(alpha!=0) %>% # worse than everything else and throwing the plots off
ggplot(aes(x=log(lambda), y= meancvm, ymin=low, ymax=high)) +
geom_ribbon(alpha=.25) +
geom_line(aes(color=as.character(alpha))) +
facet_wrap(~ as.character(alpha)) +
coord_cartesian(xlim=(c(-5,0))) +
geom_vline(aes(xintercept=log(lambda.min.mean)), alpha=.5, data=met_amt_summary_lambda) +
geom_vline(aes(xintercept=log(lambda.min.high)), alpha=.5, data=met_amt_summary_lambda, color="blue")
Make a plot of MSE at minimum lambda for each alpha
met_amt_summary_cvm %>%
group_by(alpha) %>%
filter(rank(meancvm, ties.method = "first")==1) %>%
ggplot(aes(x=alpha,y=meancvm,ymin=low,ymax=high)) +
geom_ribbon(color=NA, fill="gray80") +
geom_line() +
geom_point()
decreasing throughout, although differences are pretty small
Plot the number of nzero coefficients
met_amt_summary_lambda %>%
unnest(c(lambda, nzero)) %>%
group_by(alpha) %>%
filter(abs(lambda.min.mean-lambda)==min(abs(lambda.min.mean-lambda)) ) %>%
ungroup() %>%
ggplot(aes(x=as.character(alpha), y=nzero)) +
geom_point() +
ggtitle("Number of non-zero coefficents at minimum lambda") +
ylim(0,36)
OK let’s do repeated test train starting from these CV lambdas
multi_tt <- function(lambda, alpha, n=10000, sample_size=24, train_size=20, x, y=leaflength$leaf_avg_std) {
print(lambda)
print(alpha)
tt <-
tibble(run=1:n) %>%
mutate(train=map(run, ~ sample(1:sample_size, train_size))) %>%
mutate(fit=map(train, ~ glmnet(x=x[.,], y=y[.], lambda = lambda, alpha = alpha ))) %>%
mutate(pred=map2(fit, train, ~ predict(.x, newx = x[-.y,]))) %>%
mutate(cor=map2_dbl(pred, train, ~ cor(.x, y[-.y]) )) %>%
mutate(MSE=map2_dbl(pred, train, ~ mean((y[-.y] - .x)^2))) %>%
summarize(
num_na=sum(is.na(cor)),
num_lt_0=sum(cor<=0, na.rm=TRUE),
avg_cor=mean(cor, na.rm=TRUE),
avg_MSE=mean(MSE))
tt
}
amt_fit_test_train <- met_amt_summary_lambda %>%
select(alpha, lambda.min.mean)
amt_fit_test_train <- met_amt_multiCV %>%
filter(run==1) %>%
select(alpha, fit) %>%
right_join(amt_fit_test_train)
Joining, by = "alpha"
amt_fit_test_train <- amt_fit_test_train %>%
mutate(pred_full=map2(fit, lambda.min.mean, ~ predict(.x, s=.y, newx=met_amt.PCs)),
full_R=map_dbl(pred_full, ~ cor(.x, leaflength$leaf_avg_std)),
full_MSE=map_dbl(pred_full, ~ mean((leaflength$leaf_avg_std-.x)^2))) %>%
mutate(tt=map2(lambda.min.mean, alpha, ~ multi_tt(lambda=.x, alpha=.y, x=met_amt.PCs)))
[1] 129.3889
[1] 0
[1] 2.394178
[1] 0.1
[1] 1.622926
[1] 0.2
[1] 1.129161
[1] 0.3
[1] 0.8782994
[1] 0.4
[1] 0.709034
[1] 0.5
[1] 0.5979773
[1] 0.6
[1] 0.5183774
[1] 0.7
[1] 0.4567253
[1] 0.8
[1] 0.4084716
[1] 0.9
[1] 0.3686524
[1] 1
(amt_fit_test_train <- amt_fit_test_train %>% unnest(tt))
amt_fit_test_train %>%
ggplot(aes(x=alpha)) +
geom_line(aes(y=avg_cor), color="red") +
geom_point(aes(y=avg_cor), color="red") +
geom_line(aes(y=avg_MSE), color="blue") +
geom_point(aes(y=avg_MSE), color="blue")
not a big difference after 0.2
alpha_amt <- .7
best_amt <- amt_fit_test_train %>% filter(alpha == alpha_amt)
best_amt_fit <- best_amt$fit[[1]]
best_amt_lambda <- best_amt$lambda.min.mean
amt_coef.tb <- coef(best_amt_fit, s=best_amt_lambda) %>%
as.matrix() %>% as.data.frame() %>%
rownames_to_column(var="PC") %>%
rename(beta=`1`)
amt_coef.tb %>% filter(beta!=0) %>% arrange(beta)
NA
pred and obs
plot(leaflength$leaf_avg_std, best_amt$pred_full[[1]])
cor.test(leaflength$leaf_avg_std, best_amt$pred_full[[1]]) #.64
Pearson's product-moment correlation
data: leaflength$leaf_avg_std and best_amt$pred_full[[1]]
t = 3.9359, df = 22, p-value = 0.000705
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.3232239 0.8307777
sample estimates:
cor
0.6428067
best_amt$full_MSE
[1] 0.6804825
amt_vars <- amt_coef.tb %>%
filter(beta !=0, PC!="(Intercept)") %>%
pull(PC) %>% c("leaf_avg_std", .)
# because there is only one explanatory variable, we don't need to (and can't) use relimp
amt_relimp <- leaflength %>% select(leaf_avg_std) %>% cbind(met_amt.PCs) %>% as.data.frame() %>% dplyr::select(all_of(amt_vars)) %>% cor() %>% magrittr::extract(1,2) %>% magrittr::multiply_by(.,.)
amt_relimp <- tibble(PC=str_subset(amt_vars, "PC"),
PropVar_met_amt=amt_relimp)
amt_coef.tb <- amt_relimp %>%
full_join(amt_coef.tb) %>%
arrange(desc(PropVar_met_amt))
Joining, by = "PC"
amt_coef.tb
NA
lm with gt and trt
lmtest <- met_amt.leaf_PCA$x %>%
as.data.frame() %>%
rownames_to_column("sampleID") %>%
left_join(leaflength) %>%
select(sampleID, genotype, trt, starts_with("PC")) %>%
pivot_longer(cols=starts_with("PC"), names_to = "PC") %>%
mutate(PC=str_c("leaf_", PC)) %>%
group_by(PC) %>%
nest() %>%
mutate(lm_add=map(data, ~ lm(value ~ genotype + trt, data=.)),
lm_int=map(data, ~ lm(value ~ genotype*trt, data=.)))
Joining, by = "sampleID"
lmtest %>% mutate(broomtidy = map(lm_add, tidy)) %>%
unnest(broomtidy) %>%
select(PC, term, p.value) %>%
filter(! str_detect(term, "Intercept"),
p.value < 0.1) %>%
arrange(term, p.value)
lmtest %>% mutate(broomtidy = map(lm_int, broom::tidy)) %>%
unnest(broomtidy) %>%
select(PC, term, p.value) %>%
filter(! str_detect(term, "Intercept"),
p.value < 0.1) %>%
arrange(term, p.value)
lmtest <- met_amt.root_PCA$x %>%
as.data.frame() %>%
rownames_to_column("sampleID") %>%
left_join(leaflength) %>%
select(sampleID, genotype, trt, starts_with("PC")) %>%
pivot_longer(cols=starts_with("PC"), names_to = "PC") %>%
mutate(PC=str_c("root_", PC)) %>%
group_by(PC) %>%
nest() %>%
mutate(lm_add=map(data, ~ lm(value ~ genotype + trt, data=.)),
lm_int=map(data, ~ lm(value ~ genotype*trt, data=.)))
Joining, by = "sampleID"
lmtest %>% mutate(broomtidy = map(lm_add, tidy)) %>%
unnest(broomtidy) %>%
select(PC, term, p.value) %>%
filter(! str_detect(term, "Intercept"),
p.value < 0.1) %>%
arrange(term, p.value)
lmtest %>% mutate(broomtidy = map(lm_int, broom::tidy)) %>%
unnest(broomtidy) %>%
select(PC, term, p.value) %>%
filter(! str_detect(term, "Intercept"),
p.value < 0.1) %>%
arrange(term, p.value)
Checkout the rotations.
met_amt_rotation_out <- met_amt.PC_rotation %>%
pivot_longer(-metabolite, names_to="PC", values_to="loading") %>%
filter(PC %in% filter(amt_coef.tb, beta!=0)$PC ) %>%
group_by(PC) %>%
filter(!str_detect(metabolite,".*(leaf|root)_[0-9]*$")) %>%
filter(abs(loading) >= 0.05) %>%
left_join(amt_coef.tb, by="PC") %>%
arrange(desc(abs(beta)), desc(abs(loading))) %>%
mutate(organ=ifelse(str_detect(metabolite, "_leaf_"), "leaf", "root"),
transformation="raw",
metabolite=str_remove(metabolite, "met_amt_(root|leaf)_"),
metabolite_effect_on_leaf=ifelse(beta*loading>0, "increase", "decrease"))
met_amt_rotation_out %>% write_csv("../output/Leaf_associated_metabolites_raw_NOBLANK.csv")
met_amt_rotation_out